Inherited parameter and data from the import tab.
filtered_data <- read.csv("../data/data-2023-09-11 (2).csv", header = TRUE)
selected_stats <- c("Ho", "Hs", "Ht", "Fis (W&C)", "Fst (W&C)", "Fis (Nei)", "Fst (Nei)")
n_rep = 1000
n_marker = 6
n_pop = 8
sequence_length <- length(6:11)
num_cores <- 16
library(hierfstat)
library(adegenet)
filtered_data <- data.frame(indv = paste(substr(filtered_data$Population, 1, 3),
row.names(filtered_data), sep = "."), filtered_data)
# Create mydata_genind
population <- filtered_data$Population
mydata_genind <- df2genind(X = filtered_data[, 6:11], sep = "/", ncode = 6, ind.names = filtered_data$indv,
pop = filtered_data$Population, NA.char = "0/0", ploidy = 2, type = "codom",
strata = NULL, hierarchy = NULL)
mydata_genind
mydata_hierfstat <- genind2hierfstat(mydata_genind)
# Libraries
library("poppr")
## Registered S3 method overwritten by 'pegas':
## method from
## print.amova ade4
## This is poppr version 2.9.4. To get started, type package?poppr
## OMP parallel support: available
library("heatmaply")
## Loading required package: plotly
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: viridis
## Loading required package: viridisLite
## Registered S3 method overwritten by 'dendextend':
## method from
## rev.hclust vegan
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust vegan
##
## ======================
## Welcome to heatmaply version 1.5.0
##
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
##
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags:
## https://stackoverflow.com/questions/tagged/heatmaply
## ======================
missing_data <- info_table(mydata_genind, plot = FALSE)
# Matrix format
mat <- as.matrix(missing_data)
# heatmap
p <- heatmaply(mat, dendrogram = "none", xlab = "", ylab = "", main = "", scale = "column",
margins = c(60, 100, 40, 20), grid_color = "white", grid_width = 1e-05, titleX = FALSE,
hide_colorbar = TRUE, branches_lwd = 0.1, label_names = c("Population", "Marker",
"Value"), fontsize_row = 8, fontsize_col = 8, labCol = colnames(mat), labRow = rownames(mat),
heatmap_layers = theme(axis.line = element_blank()))
p
library(pegas)
## Loading required package: ape
##
## Attaching package: 'ape'
## The following objects are masked from 'package:hierfstat':
##
## pcoa, varcomp
##
## Attaching package: 'pegas'
## The following object is masked from 'package:ape':
##
## mst
## The following object is masked from 'package:ade4':
##
## amova
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:ape':
##
## where
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tibble)
result <- basic.stats(mydata_hierfstat) #hierfstat
df_resutl_basic <- as.data.frame(result$perloc)
# Weir and Cockrham estimates of Fstatistics - FIS and FST
result_f_stats <- Fst(as.loci(mydata_genind), na.alleles = "") #pegas
result_f_stats <- result_f_stats[, 2:3]
colnames(result_f_stats) <- c("Fst (W&C)", "Fis (W&C)")
result_f_stats <- merge(result_f_stats, df_resutl_basic, by = "row.names", all.x = TRUE)
colnames(result_f_stats)[10] <- "Fst (Nei)"
colnames(result_f_stats)[12] <- "Fis (Nei)"
result_f_stats <- result_f_stats %>%
column_to_rownames(., var = "Row.names")
result_f_stats_selec <- result_f_stats %>%
select(all_of(selected_stats))
result_f_stats_selec
library(hrbrthemes)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ lubridate 1.9.2 ✔ stringr 1.5.0
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::where() masks ape::where()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
a <- result_f_stats_selec %>%
select("Fis (W&C)")
b <- as_tibble(mat) %>%
filter(row_number() == n()) %>%
rownames_to_column %>%
gather(variable, value, -rowname) %>%
spread(rowname, value) %>%
slice(1:(n() - 1)) %>%
column_to_rownames("variable")
colnames(b) <- ("Missing %")
c <- merge(a, b, by = "row.names", all.x = TRUE) %>%
column_to_rownames("Row.names")
# Plot with linear trend
p1 <- ggplot(c, aes(x = `Missing %`, y = `Fis (W&C)`)) + geom_point() + theme_ipsum()
p1
library(ggplot2)
# compute the Gstats
result_f_stats <- result_f_stats %>%
mutate(GST = 1 - Hs/Ht)
result_f_stats <- result_f_stats %>%
mutate(`GST''` = (n_pop * (Ht - Hs))/((n_pop * Hs - Ht) * (1 - Hs)))
result_f_stats
# Plot with linear trend
p2 <- ggplot(result_f_stats, aes(x = GST, y = Hs)) + geom_point() + geom_smooth(method = lm,
color = "red", se = FALSE) + theme_ipsum()
p2
## `geom_smooth()` using formula = 'y ~ x'
library("pegas")
hw.test(as.loci(mydata_genind), B = n_rep)
## chi^2 df Pr(chi^2 >) Pr.exact
## B12 693.49866 15 0.0000000 0.000
## C07 27.78220 28 0.4760334 0.104
## D12 799.33270 66 0.0000000 0.000
## D10 492.53039 28 0.0000000 0.000
## A12 11.89948 10 0.2918403 0.225
## C03 56.70154 36 0.0153672 0.099
library(radiator)
# https://thierrygosselin.github.io/radiator/reference/genomic_converter.html
# https://thierrygosselin.github.io/radiator/articles/rad_genomics_computer_setup.html
# mydata1 <- genomic_converter(mydata_genind, parallel.core =
# parallel::detectCores() - 1, output = 'genepop', filename =
# 'mydata.genepop.txt')
library(genepop)
# https://cran.r-project.org/web/packages/genepop/genepop.pdf genepop_HW <-
# test_HW( inputFile =
# '05_radiator_genomic_converter_20231017@1623/mydata.genepop.txt', which =
# 'Proba', outputFile = '', settingsFile = '', enumeration = FALSE,
# dememorization = 10000, batches = 20, iterations = 5000, verbose =
# interactive() )
I have found different values from Fstats.
The index of association was originally developed by A.H.D. Brown analyzing population structure of wild barley (Brown, 1980).
Ia: The index of association. p.Ia: The p-value resulting from a one-sided permutation test based on the number of samples indicated in the original call. rbard: The standardized index of association. p.rd: The p-value resulting from a one-sided permutation test based on the number of samples indicated in the original call.
pair.ia calculates the index of association in a pairwise manner among all loci.
method = 1: Permute Alleles This will redistribute all alleles in the sample throughout the locus. Missing data is fixed in place. This maintains allelic structure, but heterozygosity is variable.
method = 4: Multilocus Style Permutation This will shuffle the genotypes at each locus, maintaining the heterozygosity and allelic structure.
loci_pair <- pair.ia(mydata_genind, sample = n_rep, quiet = FALSE, method = 1,plot = FALSE)
loci_pair
## Ia p.Ia rbarD p.rD
## B12:C07 -0.0039 0.9901 -0.0039 0.9901
## B12:D12 0.0545 0.8317 0.0547 0.8317
## B12:D10 0.0204 0.2079 0.0204 0.2079
## B12:A12 0.0356 0.0099 0.0356 0.0099
## B12:C03 0.0531 0.1683 0.0534 0.1782
## C07:D12 -0.0233 0.8317 -0.0233 0.8317
## C07:D10 0.0172 0.4059 0.0172 0.4059
## C07:A12 0.0364 0.5941 0.0365 0.5941
## C07:C03 0.1252 0.0693 0.1254 0.0693
## D12:D10 0.0708 0.0099 0.0709 0.0198
## D12:A12 0.0244 0.1584 0.0245 0.1584
## D12:C03 -0.0135 0.6139 -0.0135 0.6139
## D10:A12 0.0170 0.3465 0.0170 0.3465
## D10:C03 0.0456 0.1089 0.0458 0.1089
## A12:C03 0.0284 0.0990 0.0287 0.0990
“Genotypes at one locus are independent from genotypes at the other locus”. For a pair of diploid loci, no assumption is made about the gametic phase in double heterozygotes. In particular, it is not inferred assuming one-locus HW equilibrium, as such equilibrium is not assumed anywhere in the formulation of the test. The test is thus one of association between diploid genotypes at both loci, sometimes described as a test of the composite linkage disequilibrium (Bruce S. Weir 1996, 126–28). Contingency tables are created for all pairs of loci in each sample, then a G test or a probability test for each table is computed for each table using the Markov chain algorithm of Raymond and Rousset (1995a). The number of switches of the algorithm is given for each table analyzed.
library(genepop)
outfile <- test_LD(inputFile = "05_radiator_genomic_converter_20231017@1623/mydata.genepop.txt",
outputFile = "", settingsFile = "", dememorization = 10000, batches = 100, iterations = n_rep,
verbose = interactive())
readLines(outfile)[136:155]
## [1] "P-value for each locus pair across all populations"
## [2] "(Fisher's method)"
## [3] "-----------------------------------------------------"
## [4] "Locus pair Chi2 df P-Value"
## [5] "-------------------- -------- --- --------"
## [6] "A12 & B12 9.655835 16 0.883973"
## [7] "A12 & C03 22.903568 16 0.116337"
## [8] "B12 & C03 20.823728 16 0.185386"
## [9] "A12 & C07 8.267152 16 0.940512"
## [10] "B12 & C07 15.309280 16 0.502114"
## [11] "C03 & C07 23.809203 16 0.093755"
## [12] "A12 & D10 14.850121 16 0.535642"
## [13] "B12 & D10 9.224176 16 0.903895"
## [14] "C03 & D10 19.495390 16 0.243812"
## [15] "C07 & D10 12.347709 16 0.719717"
## [16] "A12 & D12 11.357848 16 0.786877"
## [17] "B12 & D12 12.699920 16 0.694559"
## [18] "C03 & D12 5.696802 16 0.991052"
## [19] "C07 & D12 31.371040 16 0.012061"
## [20] "D10 & D12 14.675705 16 0.548506"
LD2 is based on the observed frequencies of different genotypes (Schaid 2004).
mat_alleles <- filtered_data %>%
select(Population)
mat_alleles <- cbind(mat_alleles, filtered_data[, 6:11])
mat_alleles_loci <- alleles2loci(mat_alleles, ploidy = 1, population = 1, phased = FALSE)
linkage_pegas <- LD2(mat_alleles_loci)
print(linkage_pegas$T2)
## T2 df P-val
## 69.43451606 48.00000000 0.02313167
library(poppr)
library(pegas)
library(dplyr)
library(tibble)
# Permute Alleles This will redistribute all alleles in the sample throughout
# the locus. Missing data is fixed in place. #This maintains allelic structure,
# but heterozygosity is variable.
es <- replicate(n_rep, (shufflepop(mydata_genind, method = 1)))
fis_df <- numeric(sequence_length)
# for (i in 1:n_rep){ # Calculate the statistics for the i-th matrix
# result_fis_stats <- as_tibble(Fst(as.loci(es[[i]]))) %>% select('Fis' ) #
# Assign values to the data frames fis_df <- cbind(fis_df,
# result_fis_stats$Fis) }
library(foreach)
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
cl <- makeCluster(num_cores)
registerDoParallel(cl)
# Create a list to store the results
fis_list <- foreach(i = 1:n_rep, .combine = "c") %dopar% {
library(dplyr)
library(pegas)
# Calculate the statistics for the i-th matrix
result_fis_stats <- as_tibble(Fst(as.loci(es[[i]]))) %>%
select(Fis)
return(result_fis_stats$Fis)
}
# Combine the results into a matrix
fis_df <- matrix(unlist(fis_list), nrow = n_rep, byrow = TRUE)
fis_df <- t(fis_df)
# Stop the parallel processing cluster
stopCluster(cl)
# Set row names as in result_f_stats
rownames(fis_df) <- rownames(result_f_stats)
# fis_df <-fis_df[, -1]
vec <- seq(1, n_rep)
colnames(fis_df) <- vec
# Initialize an empty data frame to store the counts
fis_df_Greater <- numeric(sequence_length)
fis_df_Smaller <- numeric(sequence_length)
# Compare the values in result_f_stats[1] to result_FST for each column
for (col in colnames(fis_df)) {
greater_count <- as_tibble(result_f_stats[2] > fis_df[, col])
smaller_count <- as_tibble(result_f_stats[2] < fis_df[, col])
fis_df_Greater <- cbind(fis_df_Greater, greater_count$`Fis (W&C)`)
fis_df_Smaller <- cbind(fis_df_Smaller, smaller_count$`Fis (W&C)`)
}
# Set row names as in result_f_stats
rownames(fis_df_Smaller) <- rownames(fis_df_Greater) <- rownames(result_f_stats)
fis_df_Smaller <- fis_df_Smaller[, -1]
fis_df_Greater <- fis_df_Greater[, -1]
vec <- seq(1, n_rep)
colnames(fis_df_Smaller) <- colnames(fis_df_Greater) <- vec
fis_df_Smaller_av <- as.data.frame(fis_df_Smaller) %>%
mutate(average = rowSums(across(where(is.numeric)))/n_rep) %>%
select(average)
fis_df_Greater_av <- as.data.frame(fis_df_Greater) %>%
mutate(average = rowSums(across(where(is.numeric)))/n_rep) %>%
select(average)
rownames(fis_df_Smaller_av) <- rownames(fis_df_Greater_av) <- rownames(result_f_stats)
fis_df_sg <- merge(fis_df_Smaller_av, fis_df_Greater_av, by = "row.names", all.x = TRUE) %>%
column_to_rownames("Row.names")
colnames(fis_df_sg) <- c("smaller", "greater")
fis_df_sg_t <- fis_df_sg %>%
mutate(`Two-sided p-values` = ifelse(smaller > 0.5, 2 * (1 - smaller), 2 * smaller))
# standard deviation SE Calculate the standard deviation for each element in
# fis_df
std_dev <- as.data.frame(apply(fis_df, 1, sd))
colnames(std_dev) <- ("std_dev")
################################# Calculate SE
################################# ################################# join df
std_dev <- std_dev %>%
mutate(SE = std_dev/sqrt(n_rep) * 4)
rownames(std_dev) <- c(rownames(fis_df_sg_t))
df_merge <- merge(std_dev, fis_df_sg_t, by = "row.names", all.x = TRUE) %>%
column_to_rownames("Row.names")
df_merge <- merge(df_merge, result_f_stats[2], by = "row.names", all.x = TRUE) %>%
column_to_rownames("Row.names")
df_merge <- df_merge %>%
mutate(t = qt(1 - 0.05/2, n_pop - 1)) %>%
mutate(`95%CI_i` = `Fis (W&C)` - t * SE) %>%
mutate(`95%CI_s` = `Fis (W&C)` + t * SE) %>%
mutate()
# plot the point plot
library(ggplot2)
# Create a ggplot using df_merge
p <- ggplot(df_merge, aes(x = rownames(df_merge), y = `Fis (W&C)`)) + geom_point() +
geom_errorbar(aes(ymin = `95%CI_i`, ymax = `95%CI_s`), width = 0.2, position = position_dodge(0.05)) +
xlab("Category") + ylab("Fis (W&C)")
# Rotate X-axis labels for better readability
p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
DEVELOPMENT
not ready for deployment
library(radiator)
genomic_converter(
data,
strata = NULL,
output = NULL,
filename = NULL,
parallel.core = parallel::detectCores() - 1,
verbose = TRUE,
...
)
library(hierfstat)
write.fstat
write.struct